{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#   4.基于概率论的分类方法：朴素贝叶斯\n",
    "###  优点：在数据较少的情况下仍然有效，可以处理多类别问题。\n",
    "###    缺点：对于输入数据的准备方式比较敏感。\n",
    "###    适用数据类型：标称型数据（有限）。\n",
    "###    注释：“朴素”，这里指整个形式化过程只做最原始、最简单的假设。¶"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.1使用Python进行文本分类"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.1.1 准备数据：从文本中构建词向量"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4-1 词表到向量的转化函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def loadDataSet():\n",
    "    postingList = [['my','dog','has','flea','problems','help','please'],\n",
    "                   ['maybe','not','take','him','to','dog','park','stupid'],\n",
    "                   ['my','dalmation','is','so','cute','I','love','him'],\n",
    "                   ['stop','posting','stupid','worthless','garbage'],\n",
    "                   ['mr','licks','ate','my','steak','how','to','stop','him'],\n",
    "                   ['quit','buying','worthless','dog','food','stupid']]  # 定义邮件列表\n",
    "    classVec = [0,1,0,1,0,1]  # 1 代表侮辱性文字， 0 代表正常言论\n",
    "    return postingList, classVec\n",
    "\n",
    "def createVocabList(dataSet):  # 创建词汇列表\n",
    "    vocabSet = set([])  # 定义词汇集\n",
    "    for document in dataSet:  # 遍历文档\n",
    "        vocabSet = vocabSet | set(document)  # 将每个document合并到vocabSet，|用来联合两个集合\n",
    "    return list(vocabSet)  \n",
    "    \n",
    "def setOfWords2Vec(vocabList, inputSet):  # 把单词转换成向量\n",
    "    returnVec = [0]*len(vocabList)  # 定义要返回的向量\n",
    "    for word in inputSet:   # 遍历输出集中的单词\n",
    "        if word in vocabList:   # 单词在词汇集中\n",
    "            returnVec[vocabList.index(word)] = 1   # 对应的位置设为1\n",
    "        else:\n",
    "            print(\"the word: %s is not in my Vocabulary!\" % word)\n",
    "    return returnVec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bagOfWords2VecMN(vocabList, inputSet):   # 把单词转换成向量,用词袋模型，计算词出现的次数\n",
    "    returnVec = [0]*len(vocabList)   # 定义要返回的向量\n",
    "    for word in inputSet:  # 遍历输出集中的单词\n",
    "        if word in vocabList:  # 单词在词汇集中\n",
    "            returnVec[vocabList.index(word)] += 1    #对应的词出现次数 加1\n",
    "    return returnVec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.1.2 训练算法：从此向量计算概率\n",
    "### 4-2 朴素贝叶斯分类器训练函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trainNB0(trainMatrix, trainCategory):\n",
    "    numTrainDocs = len(trainMatrix)   # 计算文档的数目\n",
    "    numWords = len(trainMatrix[0])   # 计算单词的数目\n",
    "    pAbusive = sum(trainCategory)/float(numTrainDocs)   # 计算类别的概率，abusive为1，not abusive为0\n",
    "    p0Num = zeros(numWords)   # 初始化计数器，p0是not abusive\n",
    "    p1Num = zeros(numWords) # 初始化计数器\n",
    "    p0Denom = 0.0  # 初始化分母\n",
    "    p1Denom = 0.0\n",
    "    for i in range(numTrainDocs):\n",
    "        if trainCategory[i] == 1:   # 计算abusive对应的词汇的数目，trainMatrix为0-1值形成的向量\n",
    "            p1Num += trainMatrix[i]    # p1Num存储的是每个词出现的次数\n",
    "            p1Denom += sum(trainMatrix[i])   # p1Denom存储的是词的总数目\n",
    "        else:\n",
    "            p0Num += trainMatrix[i]   # 每个词在not abusive下出现的次数\n",
    "            p0Denom += sum(trainMatrix[i])   # not abusive下的总词数\n",
    "    p1Vect = p1Num/p1Denom   # change to log()   # 计算abusive下每个词出现的概率\n",
    "    p0Vect = p0Num/p0Denom   # change to log()   # 计算not abusive下每个词出现的概率\n",
    "    return p0Vect,p1Vect,pAbusive"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.1.3 测试算法：根据现实情况修改分类器\n",
    "### 4-3 朴素贝叶斯分类函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import * \n",
    "def classifyNB(vec2Classify, p0Vec,p1Vec,pClass1):\n",
    "    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # log()默认以e为底     ### 计算abusive的概率\n",
    "    p0 = sum(vec2Classify * p0Vec) + log(1.0-pClass1)  # 计算not abusive的概率\n",
    "    if p1 > p0:\n",
    "        return 1\n",
    "    else:\n",
    "        return 0\n",
    "    \n",
    "    \n",
    "def testingNB():\n",
    "    listOPosts, listClasses = loadDataSet()   \n",
    "    myVocabList = createVocabList(listOPosts)\n",
    "    trainMat = []\n",
    "    for postinDoc in listOPosts:\n",
    "        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))\n",
    "    p0V, p1V, pAb = trainNB0(array(trainMat),array(listClasses))\n",
    "    testEntry = ['love','my','dalmation']\n",
    "    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))\n",
    "    print(testEntry,\"classified as:\",classifyNB(thisDoc,p0V,p1V,pAb))\n",
    "    \n",
    "    testEntry = ['stupid','garbage']\n",
    "    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))\n",
    "    print(testEntry,\"classified as:\",classifyNB(thisDoc,p0V,p1V,pAb))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['love', 'my', 'dalmation'] classified as: 0\n",
      "['stupid', 'garbage'] classified as: 1\n"
     ]
    }
   ],
   "source": [
    "testingNB()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.2 示例：使用朴素贝叶斯过滤垃圾邮件"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2.1 准备数据：切分文本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M.L.', 'I', 'have', 'ever', 'laid', 'eyes', 'upon.']\n"
     ]
    }
   ],
   "source": [
    "mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'\n",
    "mySent1 = mySent.split()  #  问题： 标点符号也被当成此的一部分\n",
    "print(mySent1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon', '']\n",
      "['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'eyes', 'upon']\n",
      "['this', 'book', 'is', 'the', 'best', 'book', 'on', 'python', 'or', 'm', 'l', 'i', 'have', 'ever', 'laid', 'eyes', 'upon']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\dell\\appdata\\local\\juliapro-0.6.4.1\\python\\lib\\site-packages\\ipykernel_launcher.py:4: FutureWarning: split() requires a non-empty pattern match.\n",
      "  after removing the cwd from sys.path.\n"
     ]
    }
   ],
   "source": [
    "# 解决标点符号也被当成此的一部分的问题  ———— 使用正则表示式，分隔符是除了 |单词， 数字| 外的任意字符串\n",
    "import re\n",
    "regEx = re.compile('\\\\W*')\n",
    "listOfToken = regEx.split(mySent)\n",
    "print(listOfToken)\n",
    "a = [tok for tok in listOfToken if len(tok) > 0]  # 除去 空字符\n",
    "print(a)\n",
    "b = [tok.lower() for tok in listOfToken if len(tok) > 0]\n",
    "print(b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2.2 测试算法：使用朴素贝叶斯进行交叉验证\n",
    "### 4-5 文件解析以及完整的垃圾邮件测试函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def textParse(bigString):  # 文本解析   ### 输入是字符串，输出是单词列表\n",
    "    import re\n",
    "    listOfTokens = re.split(r'\\W*', bigString)\n",
    "    return [tok.lower() for tok in listOfTokens if len(tok) > 2]\n",
    "\n",
    "def spamTest():\n",
    "    docList = []  # 定义docList文档列表\n",
    "    classList = []  # classList类别列表\n",
    "    fullText = []   # fullText所有文档词汇\n",
    "    for i in range(1,26):\n",
    "        # 定义并读取垃圾邮件文件的词汇分割列表\n",
    "        wordList = textParse(open('email/spam/%d.txt' % i).read())   # !!! 为了避免文件中非法字符的影响，尽量用下一行\n",
    "        #wordList = textParse(open('email/spam/%d.txt' % i, \"rb\").read().decode('GBK','ignore'))\n",
    "        \n",
    "        docList.append(wordList)   # 将词汇列表加到文档列表中\n",
    "        fullText.extend(wordList)   # 将所有词汇列表汇总到fullText中\n",
    "        classList.append(1)   # 文档类别为1，spam\n",
    "        \n",
    "        #wordList = textParse(open('email/ham/%d.txt' % i).read())   # !!! 原书本中 bug 因为有可能文件中存在类似“�”非法字符。\n",
    "        # 提示：UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 199: illegal multibyte sequence\n",
    "        wordList = textParse(open('email/ham/%d.txt' % i,  \"rb\").read().decode('GBK','ignore') )  # 读取非垃圾邮件的文档\n",
    "        \n",
    "        docList.append(wordList)  \n",
    "        fullText.extend(wordList)\n",
    "        classList.append(0)  # 类别为0，非垃圾邮件\n",
    "        \n",
    "    vocabList = createVocabList(docList)\n",
    "    #trainingSet = range(50)  # !!! 原书本中 python2.7   提示：TypeError: 'range' object doesn't support item deletion\n",
    "    #因为是python3中range不返回数组对象，而是返回range对象 \n",
    "    trainingSet = list(range(50))   # 定义训练集的索引和测试集\n",
    "    testSet = []\n",
    "    for i in range(10):   # 随机的选择10个作为测试集\n",
    "        randIndex = int(random.uniform(0,len(trainingSet)))  # 随机索引\n",
    "        testSet.append(trainingSet[randIndex])  # 将随机选择的文档加入到测试集中\n",
    "        del(trainingSet[randIndex])  # 从训练集中删除随机选择的文档\n",
    "    trainMat = []   # 定义训练集的矩阵和类别 \n",
    "    trainClasses = []\n",
    "    for docIndex in trainingSet:    # 遍历训练集，求得先验概率和条件概率\n",
    "        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))   # 将词汇列表变为向量放到trainMat中\n",
    "        trainClasses.append(classList[randIndex])   # 训练集的类别标签\n",
    "    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))    # 计算先验概率，条件概率\n",
    "    errorCount = 0    \n",
    "    for docIndex in testSet:\n",
    "        wordVector = setOfWords2Vec(vocabList,docList[docIndex])   # 将测试集词汇向量化\n",
    "        if classifyNB(array(wordVector), p0V, p1V,pSpam) != classList[docIndex]:   # 对测试数据进行分类\n",
    "            errorCount += 1   # 分类不正确，错误计数加1\n",
    "    print(\"the error rate is:\",float(errorCount)/len(testSet))   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the error rate is: 0.3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\dell\\appdata\\local\\juliapro-0.6.4.1\\python\\python35.zip\\re.py:203: FutureWarning: split() requires a non-empty pattern match.\n",
      "c:\\users\\dell\\appdata\\local\\juliapro-0.6.4.1\\python\\lib\\site-packages\\ipykernel_launcher.py:17: RuntimeWarning: invalid value encountered in true_divide\n",
      "c:\\users\\dell\\appdata\\local\\juliapro-0.6.4.1\\python\\lib\\site-packages\\ipykernel_launcher.py:4: RuntimeWarning: divide by zero encountered in log\n",
      "  after removing the cwd from sys.path.\n"
     ]
    }
   ],
   "source": [
    "spamTest()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
